In [1]:
# import libraries
import os 
import json 
import nltk
import random
import string
import pickle
import collections
import numpy as np
from random import seed
from random import randint
from datetime import datetime
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
import math
import time
import numpy as np
from sklearn import metrics
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()

In [3]:
def load_data(descriptions,dir_path):
    """Collect the unique, non-empty issue descriptions stored under ``dir_path``.

    Every regular file directly inside ``dir_path`` is parsed as JSON and is
    expected to hold an iterable of issue records, each one exposing a
    ``'description'`` entry. Descriptions that are not yet known are appended
    to ``descriptions`` in place; nothing is returned.

    Args:
        descriptions: list that receives the descriptions. Entries already in
            the list also count when duplicates are filtered out.
        dir_path: directory that holds the JSON files.

    Raises:
        KeyError: if a record has no ``'description'`` entry.
        ValueError: if a file does not contain valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    # os.listdir gives no ordering guarantee; sorting makes the order of
    # `descriptions` (and everything derived from it) stable between runs.
    for fname in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path,fname)

        # skip sub-directories (e.g. notebook checkpoint folders): open() would fail on them
        if not os.path.isfile(file_path):
            continue

        # JSON is read as UTF-8 explicitly so that the platform default encoding is not used
        with open(file_path,encoding='utf-8') as json_file:
            data = json.load(json_file)

        for issue in data:
            issue_desc = issue['description']

            # skip every empty description ([], "" or null): later stages
            # join the description lines and cannot handle a missing value
            if not issue_desc:
                continue

            # keep only the first occurrence of each description
            if issue_desc not in descriptions:
                descriptions.append(issue_desc)


In [4]:
def clean_data(clean_descriptions,raw_descriptions):
    """Tokenise raw descriptions and append the usable ones to ``clean_descriptions``.

    Each raw description is joined with single spaces, punctuation is replaced
    by blanks, and the text is split on whitespace. Tokens are lower-cased;
    English stop words and one-character tokens are dropped. A description is
    appended only when at least one token survives.

    Args:
        clean_descriptions: list that receives one token list per description.
        raw_descriptions: iterable of descriptions, each an iterable of strings.
    """
    # English stop words as a set for fast membership tests
    english_stopwords = set(stopwords.words('english'))

    # maps every punctuation character to a blank
    punctuation_to_space = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    for raw_lines in raw_descriptions:

        # one sentence per description, punctuation removed, split into tokens
        tokens = ' '.join(raw_lines).translate(punctuation_to_space).split()

        kept = []
        for token in tokens:
            lowered = token.lower()
            if len(token) > 1 and lowered not in english_stopwords:
                kept.append(lowered)

        # descriptions that end up empty are dropped
        if kept:
            clean_descriptions.append(kept)
        

In [5]:
def stemming_data(descriptions):
    """Replace, in place, every purely alphabetic token by its Porter stem.

    Args:
        descriptions: list of token lists; tokens for which ``str.isalpha()``
            is false are left untouched.
    """
    porter = PorterStemmer()

    for tokens in descriptions:
        for position, token in enumerate(tokens):
            if token.isalpha():
                tokens[position] = porter.stem(token)
   

In [6]:
def split_dataset(descriptions,valid_size,test_size,min_size,random_seed=None):
    """Move randomly chosen descriptions out of ``descriptions`` into two held-out sets.

    Only descriptions with at least ``min_size`` tokens can be selected. The
    selected descriptions are removed from ``descriptions`` in place, so after
    the call that list is the training set.

    Args:
        descriptions: list of token lists; modified in place.
        valid_size: number of descriptions to move into the validation set.
        test_size: number of descriptions to move into the test set.
        min_size: minimum length of a description to be eligible.
        random_seed: optional seed for a reproducible split. ``None`` (the
            default) seeds the generator from the operating system.

    Returns:
        Tuple ``(valid_set, test_set)`` of lists of descriptions.

    Raises:
        ValueError: if fewer eligible descriptions exist than requested
            (the draw would otherwise never finish).
    """
    # A private generator is used instead of seeding with datetime.now():
    # random.seed() rejects datetime objects since Python 3.11.
    rng = random.Random(random_seed)

    def _draw(how_many):
        # Remove `how_many` random eligible descriptions and return them.
        if how_many <= 0:
            return []

        eligible = [i for i, desc in enumerate(descriptions) if len(desc) >= min_size]
        if how_many > len(eligible):
            raise ValueError(
                "cannot draw %d descriptions: only %d have at least %d tokens"
                % (how_many, len(eligible), min_size))

        chosen = rng.sample(eligible, how_many)
        drawn  = [descriptions[i] for i in chosen]

        # drop the chosen entries in a single pass, keeping the same list object
        chosen_lookup   = set(chosen)
        descriptions[:] = [desc for i, desc in enumerate(descriptions)
                           if i not in chosen_lookup]
        return drawn

    valid_set = _draw(valid_size)
    test_set  = _draw(test_size)

    return valid_set,test_set


In [7]:
# The first time this notebook runs, uncomment the line below to download
# the NLTK stop-word list used by clean_data.
#nltk.download('stopwords')

# define necessary parameters
# dir_path: directory with the JSON issue files read by load_data
# min_size: minimum number of tokens a description needs to be eligible
#           for the validation/test sets (see split_dataset)
dir_path         = '../elastic_search'
raw_descriptions = []
min_size         = 10

# load all issue descriptions (appended to raw_descriptions in place)
load_data(raw_descriptions,dir_path)

# split and clean descriptions (appended to clean_descriptions in place)
clean_descriptions = []
clean_data(clean_descriptions,raw_descriptions)

# raw_descriptions is no longer needed; release it
del raw_descriptions

# stemming is an optional step; it rewrites clean_descriptions in place
stemming_data(clean_descriptions)

# split the data set into train, validation and test sets:
# the validation set gets 30% and the test set 10% of all descriptions.
total_desc = len(clean_descriptions)
valid_size = int(0.3  * total_desc)
test_size  = int(0.1  * total_desc)

# split_dataset removes the selected descriptions from clean_descriptions,
# so after this call clean_descriptions holds only the training set
valid_set,test_set = split_dataset(clean_descriptions,valid_size,test_size,min_size)


In [8]:
# report the size of each split #
for summary_label, summary_value in (
        ("total unique descriptions", total_desc),
        ("size of train set", len(clean_descriptions)),
        ("size of validation set", valid_size),
        ("size of test set", test_size)):
    print(summary_label, summary_value)


total unique descriptions 9885
size of train set 5932
size of validation set 2965
size of test set 988


In [9]:
def save_vocabulary(word_dict,directory="../outputs_project_2"):
    """Write the vocabulary to ``words_vocabulary.txt`` inside ``directory``.

    One line is written per entry, formatted as ``"<word>, <id> "``, in the
    iteration order of ``word_dict``. An existing file is overwritten.

    Args:
        word_dict: mapping from word to id.
        directory: output directory; created (with parents) when missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(directory,exist_ok=True)

    with open(os.path.join(directory,"words_vocabulary.txt"),"w") as file:
        file.writelines("%s, %s \n"%(key,str(value)) for key,value in word_dict.items())
            

In [10]:
def save_test_pairs(test_dict,path='../outputs_project_2/testing_pairs_test.pkl'):
    """Pickle ``test_dict`` to ``path`` with the highest available protocol.

    Args:
        test_dict: object to serialise.
        path: destination file; its parent directory is created when missing,
            so the function no longer relies on another function having
            created the output directory first.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent,exist_ok=True)

    with open(path,'wb') as file:
        pickle.dump(test_dict,file,pickle.HIGHEST_PROTOCOL)

In [11]:
def create_corpus(train_set,skip_window):
    """Build the array of (target, context) word-id pairs used for training.

    Every word of every description whose id is not negative is used as a
    target. Its context words are gathered by ``find_context_words`` and one
    row ``[target, context]`` is emitted per context word, in description
    order.

    Args:
        train_set: list of descriptions, each a list of integer word ids;
            negative ids mark words that are not in the vocabulary.
        skip_window: number of context words taken on each side of a target.

    Returns:
        ``numpy.int32`` array of shape ``(number_of_pairs, 2)``.
    """
    span = 2*skip_window+1

    # a word yields at most 2*skip_window pairs, which bounds the corpus size
    total_words = sum(len(desc) for desc in train_set)
    max_size    = total_words*2*skip_window
    corpus      = np.full((max_size,2), -1, dtype=np.int32)

    # temporary buffer filled by find_context_words: target first, then its context
    buffer  = collections.deque(maxlen = span)
    counter = 0

    # plain nested iteration instead of manual pointers: empty descriptions
    # are simply skipped rather than raising IndexError
    for desc in train_set:
        for word_pointer, word_id in enumerate(desc):

            # out-of-vocabulary words are never used as targets
            if word_id < 0:
                continue

            find_context_words(desc,word_pointer,skip_window,span,buffer)

            for i in range(1,len(buffer)):
                corpus[counter][0] = buffer[0]
                corpus[counter][1] = buffer[i]
                counter += 1

            buffer.clear()

    # only the first `counter` rows were filled
    return corpus[0:counter].copy()


In [12]:
def find_context_words(description,word_index,skip_window,span,grams_list):
    """Append a target word and its context words to ``grams_list``.

    The target ``description[word_index]`` is appended first. Then up to
    ``skip_window`` words to its left are appended, nearest first, followed by
    the words to its right, nearest first, until ``span`` items have been
    considered in total. Entries equal to ``-2`` are skipped and do not count
    towards the limits, so the window can reach past them.

    Args:
        description: list of word ids.
        word_index: position of the target word inside ``description``.
        skip_window: maximum number of context words taken on the left.
        span: maximum number of items (target included); callers in this
            file pass ``2*skip_window+1``.
        grams_list: container with an ``append`` method that receives the items.
    """
    # the target word goes in the first place
    grams_list.append(description[word_index])

    # number of context words allowed on each side
    left_budget  = min(skip_window, span-1)
    right_budget = span-1-skip_window

    # look left from the target word, nearest neighbour first
    taken = 0
    for position in range(word_index-1, -1, -1):
        if taken >= left_budget:
            break
        # words that are not in the dictionary are skipped
        if description[position] == -2:
            continue
        grams_list.append(description[position])
        taken += 1

    # look right from the target word; the scan always starts just after the
    # target, also when skip_window is 0, so it can never revisit the target
    # or wrap around to the end of the description
    taken = 0
    for position in range(word_index+1, len(description)):
        if taken >= right_budget:
            break
        if description[position] == -2:
            continue
        grams_list.append(description[position])
        taken += 1
                

In [13]:
def create_dict(clean_descriptions, min_occurance, unk_word, skip_window, valid_set, test_set):
    """Build the vocabulary, encode the three data sets and create the training corpus.

    Words are ranked by frequency in ``clean_descriptions``; words seen fewer
    than ``min_occurance`` times are left out. Each remaining word gets its
    rank as id, while ``unk_word`` and every unknown word map to ``-2``.
    The vocabulary and the corpus are also written to ``../outputs_project_2``.

    Returns:
        Tuple ``(word2id, vocabulary_size, corpus, corpus_indexes,
        train_set_id, valid_set_id, test_set_id)``.
    """
    # frequency of every word of the training descriptions
    frequencies = collections.Counter(
        word for desc in clean_descriptions for word in desc)

    # most frequent first, rare words removed
    kept_words = [word for word, occurrences in frequencies.most_common()
                  if occurrences >= min_occurance]
    del frequencies

    vocabulary_size = len(kept_words)

    # word -> id; holds vocabulary_size+1 entries because of the unknown word
    word2id = {unk_word: -2}
    word2id.update((word, word_id) for word_id, word in enumerate(kept_words))
    del kept_words

    def encode(dataset):
        # express a data set through word ids; unknown words become -2
        return [[word2id.get(word,-2) for word in desc] for desc in dataset]

    train_set_id = encode(clean_descriptions)
    valid_set_id = encode(valid_set)
    test_set_id  = encode(test_set)

    # save vocabulary
    save_vocabulary(word2id)

    # create corpus with word pairs and the list of its row indexes
    corpus         = create_corpus(train_set_id,skip_window)
    corpus_indexes = list(range(len(corpus)))

    # save the corpus
    np.savetxt('../outputs_project_2/corpus_words_test.txt',corpus,fmt="%d")

    return word2id,vocabulary_size,corpus,corpus_indexes,train_set_id,valid_set_id,test_set_id

In [14]:
def create_testing_dict(test_set,min_occurance,num_words,num_words2,skip_window,true_neigh,false_neigh):
    """Pick target words from ``test_set`` and build their neighbour dictionaries.

    Word ids are ranked by frequency, ids seen fewer than ``min_occurance``
    times are dropped and the id ``-2`` is never used as a target.

    When ``num_words2 > 0`` a random sample of ``num_words2`` targets is drawn
    first, and the ``num_words`` most frequent remaining ids form a second
    target list. Otherwise only the ``num_words`` most frequent ids are used.

    Returns:
        ``(w_dict2, w_dict)``: the dictionary of the random sample (``None``
        when ``num_words2 <= 0``) and the dictionary of the frequent targets,
        both produced by ``create_testing_pairs``.
    """
    # (word id, frequency) pairs, most frequent first, rare ids removed
    frequencies = collections.Counter(word for desc in test_set for word in desc)
    count       = [entry for entry in frequencies.most_common() if entry[1] >= min_occurance]
    del frequencies

    # positions in `count` that hold a real word
    indexes = [position for position, entry in enumerate(count) if entry[0] != -2]

    def build_pairs(targets):
        return create_testing_pairs(test_set,count,targets,indexes,skip_window,true_neigh,false_neigh)

    if num_words2 <= 0:
        # test on the "num_words" most frequent words
        target_w = [count[indexes[rank]][0] for rank in range(num_words)]
        return None,build_pairs(target_w)

    # random sample of targets, kept apart from the frequent ones
    samples2  = np.random.choice(indexes,num_words2,replace = False)
    target_w2 = [count[position][0] for position in samples2]
    w_dict2   = build_pairs(target_w2)

    # the "num_words" most frequent words that were not sampled above
    sampled     = set(samples2.tolist())
    tmp_indexes = [position for position in indexes if position not in sampled]
    target_w    = [count[tmp_indexes[rank]][0] for rank in range(num_words)]
    w_dict      = build_pairs(target_w)

    return w_dict2,w_dict


In [15]:
def create_testing_pairs(test_set,count,target_w,indexes,skip_window,true_neigh,false_neigh):
    """Build, for every target word, a list of true and a list of false neighbours.

    True neighbours are the distinct context words that ``find_context_words``
    returns for any occurrence of the target in ``test_set``. False neighbours
    are ``false_neigh`` distinct words drawn at random from ``count`` (at the
    positions in ``indexes``) that are neither the target nor a true
    neighbour. Finally ``true_neigh`` true neighbours are kept at random;
    targets with fewer true neighbours are removed.

    Args:
        test_set: list of descriptions, each a list of word ids.
        count: list of ``(word id, frequency)`` pairs.
        target_w: word ids to build neighbour lists for.
        indexes: positions inside ``count`` that may be used as false neighbours.
        skip_window: context size on each side of a target.
        true_neigh: number of true neighbours kept per target.
        false_neigh: number of false neighbours drawn per target.

    Returns:
        dict ``{target: [true_neighbours, false_neighbours]}``.

    Raises:
        ValueError: if fewer than ``false_neigh`` candidate words exist for a
            target (the random search would otherwise never finish).
    """
    # initialize temporary buffer
    span   = skip_window*2+1
    buffer = collections.deque(maxlen = span)

    # initialize dictionary
    w_dict = dict([(key, [[],[]]) for key in target_w])

    # find true neighbors for target words
    for desc in test_set:
        for w in target_w:
            known = w_dict[w][0]
            for idx,e in enumerate(desc):
                if e != w:
                    continue
                # start from an empty buffer: a window shorter than `span`
                # would otherwise still contain items of the previous target
                buffer.clear()
                find_context_words(desc,idx,skip_window,span,buffer)
                for i in range(1,len(buffer)):
                    if buffer[i] not in known:
                        known.append(buffer[i])

    # find false neighbors for target words
    for key in w_dict:
        excluded = set(w_dict[key][0])
        excluded.add(key)
        candidates = [count[idx][0] for idx in indexes if count[idx][0] not in excluded]

        if len(candidates) < false_neigh:
            raise ValueError(
                "word %s has only %d false-neighbour candidates, %d requested"
                % (str(key), len(candidates), false_neigh))

        if false_neigh > 0:
            # draw positions (not values) so the stored ids keep their original type
            picked = np.random.choice(len(candidates),false_neigh,replace = False)
            w_dict[key][1].extend(candidates[i] for i in picked)

    # choose randomly only true_neigh neighbors.
    removed_keys = []
    for key in w_dict:
        if len(w_dict[key][0])>=true_neigh:
            idx_neigh = np.random.choice(len(w_dict[key][0]),true_neigh,replace = False)
            w_dict[key][0] = [w_dict[key][0][i] for i in idx_neigh]
        else:
            removed_keys.append(key)

    # targets without enough true neighbours are dropped
    for key in removed_keys:
        w_dict.pop(key)
    return w_dict

In [16]:
def generate_batch(corpus_data,corpus_indexes,batch_size):
    """Draw a random training batch of (target, context) pairs.

    Args:
        corpus_data: array-like of shape ``(n, 2)``; column 0 holds the target
            word id and column 1 the context word id.
        corpus_indexes: sequence with the row indexes that may be drawn.
        batch_size: number of distinct rows to draw.

    Returns:
        ``(batch, labels)``: ``int32`` arrays of shape ``(batch_size,)`` and
        ``(batch_size, 1)`` holding the targets and their context words.

    Raises:
        ValueError: if ``batch_size`` exceeds ``len(corpus_indexes)``.
    """
    # The generator is no longer reseeded on every call: seeding with
    # datetime.now() raises TypeError since Python 3.11, and the random
    # module already seeds itself from the operating system at import.
    words_to_use = random.sample(corpus_indexes,batch_size)

    # gather all selected rows at once instead of copying them one by one
    pairs  = np.asarray(corpus_data)[words_to_use]
    batch  = pairs[:,0].astype(np.int32)
    labels = pairs[:,1].astype(np.int32).reshape(batch_size,1)

    return batch,labels


In [17]:
def model_def_cpu(corpus_data,corpus_indexes,batch_size,embedding_dim,
                  num_sampled,learning_rate,vocabulary_size,v_batch,v_labels,
                  skip_window=None):
    """Train word embeddings with an NCE loss and return the best ones, L2-normalised.

    The model is trained with plain gradient descent for at most 50000
    batches. After every batch the loss on ``(v_batch, v_labels)`` is
    computed; the embedding matrix with the lowest such loss is kept. Training
    stops early once a patience counter, which starts at 100, loses ``step``
    per batch and is reset on every improvement, drops to zero.

    Args:
        corpus_data: array of (target, context) id pairs, see ``generate_batch``.
        corpus_indexes: row indexes of ``corpus_data`` that can be sampled.
        batch_size: number of pairs per training batch.
        embedding_dim: size of each embedding vector.
        num_sampled: number of negative samples used by the NCE loss.
        learning_rate: learning rate of the gradient descent optimizer.
        vocabulary_size: number of rows of the embedding matrix.
        v_batch: validation target ids, shape ``(m,)``.
        v_labels: validation context ids, shape ``(m, 1)``.
        skip_window: context size used for the patience step. When ``None``
            (default) the notebook-level global ``skip_window`` is read, as
            the function originally did implicitly.

    Returns:
        ``(normalized_embedding_matrix, batches_run, total_time_in_seconds)``.

    Raises:
        KeyError: if ``skip_window`` is ``None`` and no global of that name exists.
    """
    if skip_window is None:
        # backward compatibility with callers that rely on the global variable
        skip_window = globals()['skip_window']

    # patience step
    step = 10*2*skip_window*batch_size/len(corpus_indexes)
    print("step is ", step)

    # Build every call in its own graph: adding the ops to the shared default
    # graph would make it grow with each hyper-parameter combination.
    with tf.Graph().as_default():

        # Input data
        X_train = tf.placeholder(tf.int32, shape=[None])
        # Input label
        Y_train = tf.placeholder(tf.int32, shape=[None, 1])

        # ensure that the following ops & var are assigned to CPU
        with tf.device('/cpu:0'):

            # create the embedding variable which contains the weights
            embedding = tf.Variable(tf.random_normal([vocabulary_size,embedding_dim]))

            # create the lookup table for each sample in X_train=>avoiding to use one_hot encoder
            X_embed   = tf.nn.embedding_lookup(embedding,X_train)

            # create variables for the loss function
            nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size,embedding_dim],stddev=1.0))
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        loss_func = tf.reduce_mean(tf.nn.nce_loss(weights = nce_weights,biases =nce_biases,labels = Y_train,
                                                  inputs = X_embed,num_sampled = num_sampled,
                                                  num_classes = vocabulary_size ))

        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_opt = optimizer.minimize(loss_func)

        # Define initializer for tensorflow variables
        init = tf.global_variables_initializer()

        with tf.Session() as sess:

            # actually initialize the variables
            sess.run(init)

            # patience method's variables
            min_loss           = float('inf')
            min_emb_matrix     = np.zeros((vocabulary_size,embedding_dim))
            patience_remaining = 100

            start_time = time.time()
            # each iteration trains on one batch; stop early when patience runs out
            for epoch in range(50000):

                # take a batch of data.
                batch_x,batch_y = generate_batch(corpus_data,corpus_indexes,batch_size)

                _,train_loss = sess.run([train_opt,loss_func],feed_dict={X_train:batch_x, Y_train:batch_y})
                valid_loss   = sess.run(loss_func,feed_dict={X_train:v_batch, Y_train:v_labels})

                patience_remaining -= step
                if valid_loss < min_loss:
                    # new best validation loss: remember the embeddings, reset patience
                    min_loss           = valid_loss
                    patience_remaining = 100
                    min_emb_matrix     = sess.run(embedding)
                if patience_remaining <= 0:
                    break

            # restore the embeddings with the lowest validation loss
            best_embedding = tf.convert_to_tensor(min_emb_matrix)

            # normalize embeddings (unit L2 norm per row) before using them
            norm = tf.sqrt(tf.reduce_sum(tf.square(best_embedding),1,keepdims = True))
            normalized_embedding_matrix = sess.run(best_embedding/norm)

            # measure total time
            total_time = time.time() - start_time

    return normalized_embedding_matrix,epoch+1,total_time
         

In [18]:
# Evaluates the embeddings as a binary classifier over word pairs:
# class_A = real neighbor (label 1) and class_B = false neighbor (label 0).
# The cosine similarity of each pair is used as its score; the ROC curve
# (fpr, tpr) is built from these scores and only its AUC is returned.

def model_validation_v2(embedding_matrix,words_dict):
    """Score word pairs by cosine similarity and return the resulting ROC AUC.

    Args:
        embedding_matrix: array indexed by word id that yields embedding vectors.
        words_dict: mapping ``{target: [true_neighbours, false_neighbours]}``.
            Pairs with a true neighbour are labelled 1, pairs with a false
            neighbour 0.

    Returns:
        Area under the ROC curve computed from the labels and the scores.
    """
    def cosine(u, v):
        # cosine similarity of two vectors
        return np.dot(u,v)/(np.sqrt(np.dot(u,u))*np.sqrt(np.dot(v,v)))

    ylabels = []
    ypreds  = []

    for key in words_dict:
        target_emb = embedding_matrix[key]
        neighbours = words_dict[key]

        # true neighbours first (label 1), then false neighbours (label 0)
        for label, group in ((1, neighbours[0]), (0, neighbours[1])):
            for neighbour in group:
                ylabels.append(label)
                ypreds.append(cosine(target_emb, embedding_matrix[neighbour]))

    fpr,tpr,thresholds = metrics.roc_curve(np.array(ylabels),np.array(ypreds))
    return metrics.auc(fpr,tpr)

In [19]:
def save_logs(epoch,p_time,auc,t_auc,min_occurance,skip_window,embedding_dim,num_sampled,learning_rate):
    """Append the result of one training run to ``../outputs_project_2/logs.txt``.

    Four lines are written: the hyper-parameter values, the elapsed time with
    the number of epochs, the validation AUC and the testing AUC.
    """
    hyper_parameters = (min_occurance,skip_window,embedding_dim,num_sampled,learning_rate)

    with open("../outputs_project_2/logs.txt","a") as file:
        file.writelines([
            "parameter's value: min occurance %s, skip window %s, embedding dim %s, num sampled %s, learning rate %s \n"%hyper_parameters,
            "total time in sec %s and total epochs %s \n"%(p_time,epoch),
            "Validation AUC: %s \n"%(auc,),
            "Testing AUC %s \n"%(t_auc,),
        ])
        

In [20]:
# Grid search over the vocabulary/corpus parameters (min_occurance, skip_window)
# and the model parameters (embedding_dim, num_sampled, learning_rate).
# One entry is appended to the log file by save_logs for every combination.

# token stored in the vocabulary with id -2 (see create_dict)
unk_word      = "UNK"
# number of most frequent validation words used for the validation AUC
valid_words   = 80
# number of randomly sampled validation words used for the validation loss
valid_words2  = 70
# number of most frequent test words used for the testing AUC
test_words    = 100
# true / false neighbours kept per target word (see create_testing_pairs)
true_neigh    = 8
false_neigh   = 30
# number of word pairs per training batch
batch_size    = 2048
for min_occurance in [4,5,6,7,8,9,10]:
    # NOTE: model_def_cpu reads this loop variable as a global, so it must keep the name skip_window
    for skip_window in [1,2,3]:

        # vocabulary, id-encoded data sets and training corpus for this combination
        word2id,vocabulary_size,corpus,corpus_indexes,train_set_id,valid_set_id,test_set_id = create_dict(clean_descriptions, min_occurance, unk_word, skip_window, valid_set, test_set)

        # NOTE(review): min_occurance=5 and skip_window=2 are hard-coded in both
        # create_testing_dict calls below instead of the loop values — confirm this is intentional.
        # The test pairs are written to disk and reloaded after each training run.
        _,test_dict  = create_testing_dict(test_set_id,5,test_words,0,2,true_neigh,false_neigh)
        save_test_pairs(test_dict)
        del test_dict

        # v_dict2: random targets used for the validation loss; v_dict: frequent targets used for the AUC
        v_dict2,v_dict = create_testing_dict(valid_set_id,5,valid_words,valid_words2,2,true_neigh,false_neigh)
        t_batch  = []
        t_label  = []

        # flatten the (target, true neighbour) pairs of v_dict2 into two parallel lists
        for key in v_dict2:
            for value in v_dict2[key][0]:
                t_batch.append(key)
                t_label.append(value)

        # shapes expected by the placeholders of model_def_cpu: (m,) and (m,1)
        v_batch = np.reshape(t_batch,(len(t_batch),))
        v_label = np.reshape(t_label,(len(t_label),1))

        for embedding_dim in [32,64,128]:
            for num_sampled in [4,8,16,32,64]:
                for learning_rate in [0.01,0.1]:

                    # train one model; returns normalized embeddings, iterations run and elapsed seconds
                    norm_embedding_matrix,epoch,p_time = model_def_cpu(corpus,corpus_indexes,batch_size,
                                                                     embedding_dim,num_sampled,learning_rate,
                                                                     vocabulary_size,v_batch,v_label)

                    # AUC on the validation pairs
                    auc = model_validation_v2(norm_embedding_matrix,v_dict)

                    # unpickling test dictionary (written by save_test_pairs above)
                    with open('../outputs_project_2/testing_pairs_test.pkl','rb') as infile:
                        testing_dict = pickle.load(infile)

                    # AUC on the test pairs, then log the results of this combination
                    t_auc = model_validation_v2(norm_embedding_matrix,testing_dict)
                    save_logs(epoch,p_time,auc,t_auc,min_occurance,skip_window,embedding_dim,num_sampled,learning_rate)
                    print("finished with pm ",min_occurance,skip_window,embedding_dim,num_sampled,learning_rate)
            
            # pause 60 seconds after each embedding_dim value (purpose not stated in the code; presumably to let the machine rest)
            time.sleep(60)
        # additional 60 second pause after each (min_occurance, skip_window) combination
        time.sleep(60)
        
        # remove the intermediate files of this combination; they are recreated in the next iteration
        os.remove("../outputs_project_2/corpus_words_test.txt")
        os.remove("../outputs_project_2/testing_pairs_test.pkl")
        os.remove("../outputs_project_2/words_vocabulary.txt")


step is  0.03247798458881571
finished with pm  3 2 32 4 0.01
step is  0.03247798458881571
finished with pm  3 2 32 4 0.1
step is  0.03247798458881571
finished with pm  3 2 32 8 0.01
step is  0.03247798458881571
finished with pm  3 2 32 8 0.1
step is  0.03247798458881571
finished with pm  3 2 32 16 0.01
step is  0.03247798458881571
finished with pm  3 2 32 16 0.1
step is  0.03247798458881571
finished with pm  3 2 32 32 0.01
step is  0.03247798458881571
finished with pm  3 2 32 32 0.1
step is  0.03247798458881571
finished with pm  3 2 32 64 0.01
step is  0.03247798458881571
finished with pm  3 2 32 64 0.1
step is  0.03247798458881571
finished with pm  3 2 64 4 0.01
step is  0.03247798458881571
finished with pm  3 2 64 4 0.1
step is  0.03247798458881571
finished with pm  3 2 64 8 0.01
step is  0.03247798458881571
finished with pm  3 2 64 8 0.1
step is  0.03247798458881571
finished with pm  3 2 64 16 0.01
step is  0.03247798458881571
finished with pm  3 2 64 16 0.1
step is  0.0324779845888

KeyboardInterrupt: 