In [1]:
# import labriares
import os 
import json 
import nltk
import random
import string
import pickle
import collections
import numpy as np
from random import seed
from random import randint
from datetime import datetime
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

### Load & Clean the Data

In [2]:
def load_data(descriptions,dir_path):
    
    counter = 0
    counter_issues = 0
    
    for fname in os.listdir(dir_path):
        with open(os.path.join(dir_path,fname)) as json_file:
            
            ##############################
            counter += 1
            #print(counter,") reading file",fname)
            ##############################
            
            #load data in json format
            data = json.load(json_file)
            for p in data:
                
                ##############################
                issue_name     = p['name']
                counter_issues += 1
                #print("  ",counter_issues,")",issue_name)
                ##############################
                
                issue_desc     = p['description']
                
                # add all non empty issues and non dublicate.
                if issue_desc != [] and issue_desc not in descriptions:
                    descriptions.append(issue_desc)


In [3]:
def clean_data(clean_descriptions,raw_descriptions):
    
    # define stop words
    all_stopwords = set(stopwords.words('english'))
    
    # define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    
    for desc in raw_descriptions:
        
        #join all lines into one sentence
        sentence = ' '.join(desc)
        
        #translate punctuation
        new_sentence = sentence.translate(translator)
        
        #split the sentense in words
        words = new_sentence.split()
        words_sw = [w.lower() for w in words if not w.lower() in all_stopwords and len(w)>1]
        
        if words_sw != []:
            clean_descriptions.append(words_sw)
        

In [4]:
def stemming_data(descriptions):
    
    stemmer = PorterStemmer()
    
    for desc in descriptions:
        for counter in range(len(desc)):
            if desc[counter].isalpha():
                desc[counter] = stemmer.stem(desc[counter])
            

In [5]:
def split_dataset(descriptions,valid_size,test_size,min_size):
    
    valid_set = []
    test_set  = []
    
    # random select descriptions.
    seed(datetime.now())
    
    for i in range(valid_size):
        flag = False
        while flag == False:
            temp = randint(0,len(descriptions)-1)
            if len(descriptions[temp]) >= min_size:
                valid_set.append(descriptions.pop(temp))
                flag = True
    
    for i in range(test_size):
        flag = False
        while flag == False:
            temp = randint(0,len(descriptions)-1)
            if len(descriptions[temp]) >= min_size:
                test_set.append(descriptions.pop(temp))
                flag = True
    
    return valid_set,test_set

In [6]:
#the first time the below command should run to download stopwords
#nltk.download('stopwords')

# define necessary parameters
dir_path         = '/home/kostas/Documents/thesis/data_1'
raw_descriptions = []
min_size         = 10

# load all issues descriptions
load_data(raw_descriptions,dir_path)

# split and clean descriptions
clean_descriptions = []
clean_data(clean_descriptions,raw_descriptions)

# list raw_descriptions now is useless
del raw_descriptions

# stemming, it's not necessary step.
stemming_data(clean_descriptions)

# split data set to train,validation and test set
# validation and test set would have 20% of total data.
total_desc = len(clean_descriptions)
valid_size = int(0.2 * total_desc)
test_size  = int(0.2 * total_desc)

valid_set,test_set = split_dataset(clean_descriptions,valid_size,test_size,min_size)

In [7]:
# print messages #
print("total unique descriptions",total_desc)
print("size of train set",len(clean_descriptions))
print("size of validation set",valid_size)
print("size of test set",test_size)

total unique descriptions 5973
size of train set 3585
size of validation set 1194
size of test set 1194


### Create Vocabulary

In [8]:
# define some important variables
valid_words   = 100
test_words    = 100
true_neigh    = 8
false_neigh   = 30

min_occurance = 5
unk_word      = "UNK"
skip_window   = 4
batch_size    = 2048
embedding_dim = 64
num_sampled   = 32
learning_rate = 0.1

In [9]:
def save_vocabulary(word_dict):
    
    with open("vocabulary_test.txt","w") as file:
        for key in word_dict:
            file.write("%s, %s \n"%(key,str(word_dict[key])))

In [10]:
def create_corpus(train_set,skip_window):
    
    #find total words in descriptions
    total_words = 0
    for desc in train_set:
        total_words += len(desc)
    
    #initialize the corpus which will keep all word pairs
    max_size = total_words*2*skip_window
    corpus = -1*np.ones((max_size,2), dtype=np.int32)
    
    # initialize pointers for the iterations
    desc_pointer  = 0
    word_pointer  = 0
    counter       = 0
    
    #initialize temporary buffer
    span   = 2*skip_window+1 
    buffer = collections.deque(maxlen = span)
    
    while counter < max_size:
        
        # avoid tags with -2
        while train_set[desc_pointer][word_pointer] < 0:
            word_pointer += 1
            if word_pointer > len(train_set[desc_pointer])-1:
                word_pointer  = 0
                desc_pointer +=1
                if desc_pointer > len(train_set) -1:
                    break
                    
        #check if all descriptions have been analyzed
        if desc_pointer > len(train_set)-1:
            break
        
        find_context_words(train_set[desc_pointer],word_pointer,skip_window,span,buffer)
        
        for i in range(1,len(buffer)):
            corpus[counter][0] = buffer[0]
            corpus[counter][1] = buffer[i]
            counter += 1
        
        buffer.clear()
        
        if word_pointer == len(train_set[desc_pointer]) -1:
            word_pointer  = 0
            desc_pointer +=1
            if desc_pointer > len(train_set) -1:
                break
        else:
            word_pointer += 1
    
    return corpus[0:counter].copy()

In [11]:
def find_context_words(description,word_index,skip_window,span,grams_list):
    
    # the target word in the first place
    grams_list.append(description[word_index])
    
    # initialize two pointers
    counter = 1
    data_index = word_index-1
    
    while counter < span:
        # look left from target word
        if counter<=skip_window:
            # if data_index<0 => out of bound no more words to take into account
            if data_index < 0:
                data_index = word_index + 1
                counter = skip_window + 1
            # if the word is not in the dict skip it
            elif description[data_index] == -2:
                data_index -= 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index -= 1
                if counter > skip_window:
                    data_index = word_index + 1
        # look right from target word
        else:
            if data_index >= len(description):
                counter = span + 1
            elif description[data_index] == -2:
                data_index += 1
            else:
                grams_list.append(description[data_index])
                counter    += 1
                data_index += 1
    
    

In [12]:
# create vocabulary based on the frequency of each word.
# remove rare words, which occurs less time than min_occurance from voc
# word2id:  dictionary which contains the vocabulary and it's int id

temp_sentences = [word for desc in clean_descriptions for word in desc]

count = []
count.extend(collections.Counter(temp_sentences).most_common())

# list temp_sentences now is useless
del temp_sentences

count[:] = [e for e in count if e[1]>=min_occurance]

# compute the vocabulary size
vocabulary_size = len(count)

# assign an id to each word
# this dictionary will have voc_size+1 length.
word2id = dict()
word2id[unk_word] = -2

for i,(word,_) in enumerate(count):
    word2id[word] = i

# list count now is useless
del count

#express train, valid and test set using id
train_set_id = [[word2id.get(word,-2) for word in desc] for desc in clean_descriptions]
del clean_descriptions
valid_set_id = [[word2id.get(word,-2) for word in desc] for desc in valid_set]
del valid_set
test_set_id  = [[word2id.get(word,-2) for word in desc] for desc in test_set]
del test_set

# save vocabulary
save_vocabulary(word2id)

# create corpus with word pairs
corpus         = create_corpus(train_set_id,skip_window)
corpus_indexes = [w for w in range(len(corpus))] 

# save them 
np.savetxt('corpus_words_test.txt',corpus,fmt="%d")

# train_set_id now is useless
del train_set_id

### Validation and Test Pairs Creation

In [13]:
def save_test_pairs(test_dict):
    
    with open('testing_pairs_test.pkl','wb') as file:
        pickle.dump(test_dict,file,pickle.HIGHEST_PROTOCOL)
        

In [14]:
def partition(arr,low,high):
    
    i = (low - 1)
    pivot = arr[high][1]
    
    for j in range(low,high):
        
        if arr[j][1] >= pivot:
            
            i += 1
            arr[i], arr[j] = arr[j], arr[i]
    
    arr[i+1],arr[high] = arr[high], arr[i+1]
    return (i+1)

In [15]:
def quickSort(arr,low,high):
    if len(arr) ==1:
        return arr
    if low<high:
        pi = partition(arr,low,high)
        
        quickSort(arr,low,pi-1)
        quickSort(arr,pi+1,high)
    

In [24]:
def create_testing_pairs(test_set,min_occurance,num_words,skip_window,true_neigh,false_neigh):
    
    # initialize temporary buffer
    span   = skip_window*2+1
    buffer = collections.deque(maxlen = span)
    
    # numerate all words in the dataset.
    temp_sentences = [word for desc in test_set for word in desc]
    count = []
    count.extend(collections.Counter(temp_sentences).most_common())
    
    # list temp_sentences now is useless
    del temp_sentences
    
    # remove rare words
    count[:] = [e for e in count if e[1]>=min_occurance]
    
    # compute weights and select num_testing words
    weights      = [e[1] for e in count if e[0] != -2 ]
    total_weight = np.sum(weights)
    
    
    # generate random samples    
    weights[:] = [x/total_weight for x in weights]
    indexes    = [i for i in range(len(count)) if count[i][0]!=-2]
    samples    = np.random.choice(indexes,num_words,replace = False, p = weights)
    
    target_w   = [count[i][0] for i in samples]
    w_dict     = dict([(key, [[],[]]) for key in target_w])
    
    
    for desc in test_set:
        for w in target_w:
            temp_idx = [i for i,e in enumerate(desc) if e == w]
            for idx in temp_idx:
                find_context_words(desc,idx,skip_window,span,buffer)
                for i in range(1,len(buffer)):
                    if w_dict[w][0] == []:
                        w_dict[w][0].append([buffer[i],1])
                    else:
                        flag = False
                        for neigh in w_dict[w][0]:
                            if neigh[0] == buffer[i]:
                                neigh[1] += 1
                                flag = True
                        if flag == False:
                            w_dict[w][0].append([buffer[i],1])
                buffer.clear()
                
    # sort lists based on the frequency of neighbor's appearences. Keep only the 8 most frequent.
    # create false_neigh false neighbors for each target word.
    for key in w_dict:
        quickSort(w_dict[key][0],0,len(w_dict[key][0])-1)
        
        flag = True
        neig_counter = 0
        while flag == True:
            random_idx = np.random.choice(indexes,2*false_neigh,replace = False)
            for idx in random_idx:
                
                if count[idx][0] == key:
                    continue
                elif count[idx][0] in w_dict[key][1]:
                    continue
                else:
                    is_flag = True
                    #search neighbors
                    for neigh in w_dict[key][0]:
                        if count[idx][0] == neigh[0]:
                            is_flag = False
                            break
                    if is_flag == True:
                        w_dict[key][1].append(count[idx][0])
                        neig_counter += 1
                        if neig_counter >= false_neigh:
                            flag = False
                            break
        # if len(w_dict[key][0])>8:
        #    w_dict[key][0]= [w_dict[key][0][i] for i in range(8)]
    #for key in w_dict:
    #    print(key,w_dict[key][0])
    print(0,w_dict[0][0])
    return w_dict

In [25]:
#test_dict  = create_testing_pairs(test_set_id,min_occurance,test_words,2,true_neigh,false_neigh)
#save_test_pairs(test_dict)
#del test_dict


valid_dict = create_testing_pairs(valid_set_id,min_occurance,valid_words,2,true_neigh,false_neigh)

0 [[5, 171], [0, 140], [1, 121], [14, 114], [32, 104], [9, 103], [4, 81], [31, 80], [20, 79], [52, 66], [24, 65], [15, 65], [117, 55], [2, 45], [61, 45], [16, 43], [3, 41], [65, 38], [67, 38], [45, 33], [28, 31], [71, 31], [35, 31], [69, 31], [84, 30], [30, 29], [40, 28], [7, 27], [86, 26], [68, 26], [34, 26], [124, 26], [10, 26], [23, 25], [55, 25], [6, 25], [97, 24], [143, 24], [92, 24], [192, 24], [172, 23], [49, 23], [458, 22], [72, 22], [70, 22], [121, 22], [12, 22], [88, 22], [62, 21], [37, 21], [51, 21], [329, 21], [58, 20], [152, 20], [64, 20], [139, 20], [107, 19], [13, 19], [74, 19], [150, 17], [18, 17], [48, 17], [216, 17], [116, 17], [75, 16], [491, 16], [38, 16], [33, 16], [29, 16], [81, 16], [619, 16], [288, 16], [125, 15], [83, 15], [50, 15], [134, 15], [56, 15], [63, 15], [80, 14], [25, 14], [98, 14], [268, 13], [19, 13], [101, 13], [195, 13], [193, 13], [156, 13], [251, 13], [99, 13], [512, 13], [76, 13], [265, 13], [11, 12], [343, 12], [167, 12], [283, 12], [170, 12],

## Word's Embeddings Model Definition and Training

In [16]:
import math
import time
import numpy as np
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()

In [161]:
def generate_batch(corpus_data,corpus_indexes,batch_size):
    
    batch  = np.ndarray(shape = (batch_size),   dtype = np.int32)
    labels = np.ndarray(shape = (batch_size,1), dtype = np.int32)
    
    seed(datetime.now())
    
    words_to_use = random.sample(corpus_indexes,batch_size)
    
    for counter,value in enumerate(words_to_use):
        batch[counter]    = corpus_data[value][0]
        labels[counter,0] = corpus_data[value][1] 
    
    return batch,labels

In [162]:
def model_def_cpu(corpus_data,corpus_indexes,batch_size,embedding_dim,
                  num_sampled,learning_rate,vocabulary_size):
    
    # Input data
    X_train = tf.placeholder(tf.int32, shape=[None])
    # Input label
    Y_train = tf.placeholder(tf.int32, shape=[None, 1])
    
    # ensure that the following ops & var are assigned to CPU
    with tf.device('/cpu:0'):
        
        # create the embedding variable wich contains the weights
        embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))
        
        # create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
        X_embed   = tf.nn.embedding_lookup(embedding,X_train)
        
        # create variables for the loss function
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_dim],stddev=1.0))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        
    loss_func = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases,labels = Y_train,
                                              inputs = X_embed,num_sampled = num_sampled,
                                              num_classes = vocabulary_size ))
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    
    train_opt = optimizer.minimize(loss_func)
    
    #Define initializer for tensorflow variables
    init = tf.global_variables_initializer()
    
    with tf.Session() as sess:
        
        #actual initialize the variables
        sess.run(init)
        
        # patience method's variables 
        min_loss           = float('inf')
        min_emb_matrix     = np.zeros((vocabulary_size,embedding_dim))
        patience_remaining = 100
        
        start_time = time.time()
        #train the model using 100 epoch patience
        for epoch in range(50000):
            
            # take a batch of data.
            batch_x,batch_y = generate_batch(corpus_data,corpus_indexes,batch_size)
            
            _,loss = sess.run([train_opt,loss_func],feed_dict={X_train:batch_x, Y_train:batch_y})
            
            patience_remaining -= 1
            
            if loss < min_loss:
                min_loss           = loss
                patience_remaining = 200
                min_emb_matrix     = embedding.eval()
                
            if patience_remaining == 0:
                break
        
        #normalize embeddings before using them
        #restore min embeddings
        embedding = tf.convert_to_tensor(min_emb_matrix)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embedding),1,keepdims = True))
        normalized_embedding = embedding/norm
        normalized_embedding_matrix = sess.run(normalized_embedding)
        
        #measure total time
        total_time = time.time() - start_time
        print("training time in seconds %s "%(str(total_time)))
        print("total epochs was",epoch)
    return normalized_embedding_matrix
         

In [163]:
def check_neigh(prediction_buffer,buffer_len,cosine_sim,word):
    
    if len(prediction_buffer)< buffer_len:
        prediction_buffer.append((word,cosine_sim))
        quickSort(prediction_buffer,0,len(prediction_buffer)-1)
    else:
        if cosine_sim > prediction_buffer[buffer_len-1][1]:
            prediction_buffer.pop()
            prediction_buffer.append((word,cosine_sim))
            quickSort(prediction_buffer,0,buffer_len-1)

In [164]:
# the first version just computes fpr and tpr. Model's classes
# are class_A = neighbor and class_B = no neighbor. The model based on cosine similarity
# will try to predict the right label for each word pair given.
def model_validation_v1(embedding_matrix,words_dict,true_neigh):
    
    prediction_buffer = collections.deque(maxlen = true_neigh)
    tp = 0
    fn = 0
    fp = 0
    tn = 0
    for key in words_dict:
        target_emb = embedding_matrix[key]
        for neigh in words_dict[key][0]:
            neigh_emb = embedding_matrix[neigh[0]]
            result    = np.dot(target_emb,neigh_emb)/(np.sqrt(np.dot(target_emb,target_emb))*np.sqrt(np.dot(neigh_emb,neigh_emb)))
            check_neigh(prediction_buffer,true_neigh,result,neigh[0])
        for neigh in words_dict[key][1]:
            neigh_emb = embedding_matrix[neigh]
            result    = np.dot(target_emb,neigh_emb)/(np.sqrt(np.dot(target_emb,target_emb))*np.sqrt(np.dot(neigh_emb,neigh_emb)))
            check_neigh(prediction_buffer,true_neigh,result,neigh)
        
        fp_counter = 0
        for neigh in prediction_buffer:
            if neigh[0] in words_dict[key][1]:
                fp_counter += 1
        fp = fp + fp_counter
        tn = tn + (30-fp_counter)
        tp = tp + (8-fp_counter)
        fn = fn + fp_counter
        
    tpr = tp/(tp+fn)
    fpr = fp/(tn+fp)
    return tpr,fpr

In [165]:
norm_embedding_matrix = model_def_cpu(corpus,corpus_indexes,batch_size,embedding_dim,num_sampled,
                                      learning_rate,vocabulary_size)

np.savetxt('word_embeddings_test.txt',norm_embedding_matrix,fmt='%.8f')
model_validation_v1(norm_embedding_matrix,valid_dict,true_neigh)

training time in seconds 3.5796594619750977 
total epochs was 677


(0.9725, 0.007333333333333333)