### Importing required packages into python:

In [2]:
# Required dependencies
# 1. NLTK
# 2. Gensim for word2vec
# 3. Keras with tensorflow/theano backend


import numpy as np
np.random.seed(1337)
import json, re, nltk, string, csv, sys, codecs
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from keras import layers
from keras.optimizers import RMSprop
from keras.utils import np_utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics.pairwise import cosine_similarity


# Hack to increase size due to Error: field larger than field limit (131072)
maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.

    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt / 10)
        decrement = True
        
open_bugs_csv = 'e1_open.csv'
closed_bugs_csv = 'm15_closed.csv'

#========================================================================================
# Initializing Hyper parameter
#========================================================================================
#1. Word2vec parameters
min_word_frequency_word2vec = 5
embed_size_word2vec = 200
context_window_word2vec = 5

#2. Classifier hyperparameters
numCV = 10
max_sentence_len = 50
min_sentence_length = 15
rankK = 10
batch_size = 32

#========================================================================================
# Preprocess the open bugs, extract the vocabulary and learn the word2vec representation
#========================================================================================
with open(open_bugs_csv) as data_file:
    data = csv.reader(data_file, delimiter=';')

    all_data = []
    for item in data:
        #1. Remove \r 
        current_title = unicode(item[1], errors='ignore').replace('\r', ' ')
        #print current_title
        current_desc = unicode(item[3], errors='ignore').replace('\r', ' ')
        #print current_desc
        #2. Remove URLs
        current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
        #3. Remove Stack Trace
        start_loc = current_desc.find("Stack trace:")
        current_desc = current_desc[:start_loc]    
        #4. Remove hex code
        current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
        current_title= re.sub(r'(\w+)0x\w+', '', current_title)    
        #5. Change to lower case
        current_desc = current_desc.lower()
        current_title = current_title.lower()    
        #6. Tokenize
        current_desc_tokens = nltk.word_tokenize(current_desc)
        current_title_tokens = nltk.word_tokenize(current_title)
        #7. Strip trailing punctuation marks    
        current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
        current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]      
        #8. Join the lists
        current_data = current_title_filter + current_desc_filter
        current_data = filter(None, current_data)
        all_data.append(current_data)
        
#print(len(all_data))
# Learn the word2vec model and extract vocabulary
wordvec_model = Word2Vec(all_data, min_count=min_word_frequency_word2vec, size=embed_size_word2vec, window=context_window_word2vec)
vocabulary = wordvec_model.wv.vocab
#print vocabulary
vocab_size = len(vocabulary)

#========================================================================================
# Preprocess the closed bugs, using the extracted the vocabulary
#========================================================================================
with open(closed_bugs_csv) as data_file:
    data = csv.reader(data_file, delimiter=';')

    all_data = []
    all_owner = []    
    for item in data:
        #1. Remove \r 
        current_title = unicode(item[1], errors='ignore').replace('\r', ' ')
        current_desc = unicode(item[3], errors='ignore').replace('\r', ' ')
        #2. Remove URLs
        current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)
        #3. Remove Stack Trace
        start_loc = current_desc.find("Stack trace:")
        current_desc = current_desc[:start_loc]
        #4. Remove hex code
        current_desc = re.sub(r'(\w+)0x\w+', '', current_desc)
        current_title= re.sub(r'(\w+)0x\w+', '', current_title)
        #5. Change to lower case
        current_desc = current_desc.lower()
        current_title = current_title.lower()
        #6. Tokenize
        current_desc_tokens = nltk.word_tokenize(current_desc)
        current_title_tokens = nltk.word_tokenize(current_title)
        #7. Strip punctuation marks
        current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
        current_title_filter = [word.strip(string.punctuation) for word in current_title_tokens]       
        #8. Join the lists
        current_data = current_title_filter + current_desc_filter
        current_data = filter(None, current_data)
        all_data.append(current_data)
        all_owner.append(item[4])
        
#========================================================================================
# Split cross validation sets and perform deep learning + softamx based classification
#========================================================================================
totalLength = len(all_data)
splitLength = int(totalLength / (numCV + 1))

for i in range(1, numCV + 1):
    # Split cross validation set
    print("Starting work on cross validation set {0}".format(i))
    train_data = all_data[:i*splitLength-1]
    test_data = all_data[i*splitLength:(i+1)*splitLength-1]
    train_owner = all_owner[:i*splitLength-1]
    test_owner = all_owner[i*splitLength:(i+1)*splitLength-1]
    
    # Remove words outside the vocabulary
    updated_train_data = []    
    updated_train_data_length = []    
    updated_train_owner = []
    final_test_data = []
    final_test_owner = []
    for j, item in enumerate(train_data):
        current_train_filter = [word for word in item if word in vocabulary]
        if len(current_train_filter) >= min_sentence_length:  
          updated_train_data.append(current_train_filter)
          updated_train_owner.append(train_owner[j])  
          
    for j, item in enumerate(test_data):
        current_test_filter = [word for word in item if word in vocabulary]  
        if len(current_test_filter) >= min_sentence_length:
          final_test_data.append(current_test_filter)          
          final_test_owner.append(test_owner[j])          
    
    # Remove data from test set that is not there in train set
    train_owner_unique = set(updated_train_owner)
    test_owner_unique = set(final_test_owner)
    unwanted_owner = list(test_owner_unique - train_owner_unique)
    updated_test_data = []
    updated_test_owner = []
    updated_test_data_length = []
    for j in range(len(final_test_owner)):
        if final_test_owner[j] not in unwanted_owner:
            updated_test_data.append(final_test_data[j])
            updated_test_owner.append(final_test_owner[j])

    unique_train_label = list(set(updated_train_owner))
    classes = np.array(unique_train_label)
    
    # Create train and test data for deep learning + softmax
    X_train = np.empty(shape=[len(updated_train_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_train = np.empty(shape=[len(updated_train_owner), 1], dtype='int32')
    # 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
    for j, curr_row in enumerate(updated_train_data):
        sequence_cnt = 0         
        for item in curr_row:
            if item in vocabulary:
                X_train[j, sequence_cnt, :] = wordvec_model[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                          break                
        for k in range(sequence_cnt, max_sentence_len):
            X_train[j, k, :] = np.zeros((1, embed_size_word2vec))        
        Y_train[j, 0] = unique_train_label.index(updated_train_owner[j])
    
    X_test = np.empty(shape=[len(updated_test_data), max_sentence_len, embed_size_word2vec], dtype='float32')
    Y_test = np.empty(shape=[len(updated_test_owner),1], dtype='int32')
    # 1 - start of sentence, # 2 - end of sentence, # 0 - zero padding. Hence, word indices start with 3 
    for j, curr_row in enumerate(updated_test_data):
        sequence_cnt = 0          
        for item in curr_row:
            if item in vocabulary:
                X_test[j, sequence_cnt, :] = wordvec_model[item] 
                sequence_cnt = sequence_cnt + 1                
                if sequence_cnt == max_sentence_len-1:
                          break                
        for k in range(sequence_cnt, max_sentence_len):
            X_test[j, k, :] = np.zeros((1, embed_size_word2vec))        
        Y_test[j, 0] = unique_train_label.index(updated_test_owner[j])
        
    y_train = np_utils.to_categorical(Y_train, len(unique_train_label))
    y_test = np_utils.to_categorical(Y_test, len(unique_train_label))


    # TODO: Add x_train and x_test
    
    # Construct the deep learning model
    print("Creating Model")
    sequence = Input(shape=(max_sentence_len, embed_size_word2vec), dtype='float32')
    forwards_1 = LSTM(1024)(sequence)
    after_dp_forward_4 = Dropout(0.20)(forwards_1) 
    backwards_1 = LSTM(1024, go_backwards=True)(sequence)
    after_dp_backward_4 = Dropout(0.20)(backwards_1)         
    #merged = merge([after_dp_forward_4, after_dp_backward_4], mode='concat', concat_axis=-1)
    merged = layers.concatenate([after_dp_forward_4, after_dp_backward_4], axis=-1)
    after_dp = Dropout(0.5)(merged)
    output = Dense(len(unique_train_label), activation='softmax')(after_dp)                
    model = Model(input=sequence, output=output)            
    rms = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy', optimizer=rms, metrics=['accuracy'])    
    hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=20)  # Rename nb_epochs to epochs // Value original: 200
    
    predict = model.predict(X_test)        
    accuracy = []
    sortedIndices = []
    pred_classes = []
    if len(predict) == 0:
        exit(1)  # Avoid divide by zero
    for ll in predict:
          sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
    for k in range(1, rankK + 1):
          id = 0
          trueNum = 0
          for sortedInd in sortedIndices:
            pred_classes.append(classes[sortedInd[:k]])
            if y_test[id] in classes[sortedInd[:k]]:
                  trueNum += 1            
            id += 1
          accuracy.append((float(trueNum) / len(predict)) * 100)
    print("Test accuracy: ", accuracy)       
    
    train_result = hist.history        
    print(train_result)
    del model

    
#========================================================================================
# Split cross validation sets and perform baseline classifiers
#========================================================================================    
    
totalLength = len(all_data)
splitLength = totalLength / (numCV + 1)

for i in range(1, numCV+1):
    # Split cross validation set
    print("Starting cross validation {0}".format(i))
    train_data = all_data[:i*splitLength-1]
    test_data = all_data[i*splitLength:(i+1)*splitLength-1]
    train_owner = all_owner[:i*splitLength-1]
    test_owner = all_owner[i*splitLength:(i+1)*splitLength-1]
    
    # Remove words outside the vocabulary
    updated_train_data = []    
    updated_train_data_length = []    
    updated_train_owner = []
    final_test_data = []
    final_test_owner = []
    for j, item in enumerate(train_data):
        current_train_filter = [word for word in item if word in vocabulary]
        if len(current_train_filter)>=min_sentence_length:  
          updated_train_data.append(current_train_filter)
          updated_train_owner.append(train_owner[j])  
          
    for j, item in enumerate(test_data):
        current_test_filter = [word for word in item if word in vocabulary]  
        if len(current_test_filter)>=min_sentence_length:
          final_test_data.append(current_test_filter)          
          final_test_owner.append(test_owner[j])          
    
    # Remove data from test set that is not there in train set
    train_owner_unique = set(updated_train_owner)
    test_owner_unique = set(final_test_owner)
    unwanted_owner = list(test_owner_unique - train_owner_unique)
    updated_test_data = []
    updated_test_owner = []
    updated_test_data_length = []
    for j in range(len(final_test_owner)):
        if final_test_owner[j] not in unwanted_owner:
            updated_test_data.append(final_test_data[j])
            updated_test_owner.append(final_test_owner[j])  
    
    train_data = []
    for item in updated_train_data:
          train_data.append(' '.join(item))
         
    test_data = []
    for item in updated_test_data:
          test_data.append(' '.join(item))
    
    vocab_data = []
    for item in vocabulary:
          vocab_data.append(item)
    
    # Extract tf based bag of words representation
    tfidf_transformer = TfidfTransformer(use_idf=False)
    count_vect = CountVectorizer(min_df=1, vocabulary= vocab_data,dtype=np.int32)
    
    train_counts = count_vect.fit_transform(train_data)       
    train_feats = tfidf_transformer.fit_transform(train_counts)
    print(train_feats.shape)
    
    test_counts = count_vect.transform(test_data)
    test_feats = tfidf_transformer.transform(test_counts)
    print(test_feats.shape)
    print("=" * 20)
    
    
    
    # perform classifification
    for classifier in range(1,5):
        #classifier = 3 # 1 - Niave Bayes, 2 - Softmax, 3 - cosine distance, 4 - SVM
        print classifier 
        if classifier == 1:            
            classifierModel = MultinomialNB(alpha=0.01)        
            classifierModel = OneVsRestClassifier(classifierModel).fit(train_feats, updated_train_owner)
            predict = classifierModel.predict_proba(test_feats)  
            classes = classifierModel.classes_  
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                                    
        elif classifier == 2:            
            classifierModel = LogisticRegression(solver='lbfgs', penalty='l2', tol=0.01)
            classifierModel = OneVsRestClassifier(classifierModel).fit(train_feats, updated_train_owner)
            predict = classifierModel.predict(test_feats)
            classes = classifierModel.classes_ 
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                                   
        elif classifier == 3:            
            predict = cosine_similarity(test_feats, train_feats)
            classes = np.array(updated_train_owner)
            classifierModel = []
            
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                        
        elif classifier == 4:            
            classifierModel = svm.SVC(probability=True, verbose=False, decision_function_shape='ovr', random_state=42)
            classifierModel.fit(train_feats, updated_train_owner)
            predict = classifierModel.predict(test_feats)
            classes = classifierModel.classes_ 
        
            accuracy = []
            sortedIndices = []
            pred_classes = []
            for ll in predict:
                sortedIndices.append(sorted(range(len(ll)), key=lambda ii: ll[ii], reverse=True))
            for k in range(1, rankK+1):
                id = 0
                trueNum = 0
                for sortedInd in sortedIndices:            
                    if updated_test_owner[id] in classes[sortedInd[:k]]:
                        trueNum += 1
                        pred_classes.append(classes[sortedInd[:k]])
                    id += 1
                accuracy.append((float(trueNum) / len(predict)) * 100)
            print accuracy                            
    

Starting work on cross validation set 1
Creating Model




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.0540540556649904, 0.09459459499732868, 0.10810810810810811, 0.16216216256489624, 0.18918918999465736, 0.1891891891891892, 0.20270270350817088, 0.29729729749866435, 0.337837838643306, 0.41891892052985524, 0.2837837839851508, 0.445945946147313, 0.3918918935028282, 0.37837837878111247, 0.5810810843029538, 0.5675675659566313, 0.6891891924110619, 0.6621621605512258, 0.48648648487555013, 0.7162162146052798], 'loss': [4.840971998266272, 4.016664157042632, 3.3839768461278967, 3.0930660737527385, 2.930050347302411, 2.790204009494266, 2.6332499014364705, 2.440565895389866, 2.4735961347012907, 2.1940622523024276, 2.2294045461190715, 1.9372276099952492, 2.0618390199300407, 2.6910083487227157, 1.7237013288446374, 1.5644380208608266, 1.379394125294041, 1.2679926221435134, 1.862831473350525, 1.1224404509003099]}
Starting work on cross validation set 2
Creating Model
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/2

Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.026578073089700997, 0.05315614642692959, 0.05315614642692959, 0.11960132902741828, 0.10631229235880399, 0.1229235885349223, 0.13953488377043574, 0.172757475132562, 0.21926910311379702, 0.27242524941696283, 0.3189368771259175, 0.37209302375086917, 0.5016611296671174, 0.5747508308618172, 0.6179402003256586, 0.6877076415920574, 0.7973421934831182, 0.7774086384678204, 0.8970099669754307, 0.9102990037183033], 'loss': [4.8891635099518735, 4.260608612104904, 4.105057171412876, 3.91272412265258, 3.9369022299680996, 3.6524393376321886, 3.509311457409019, 3.290358064182573, 3.1186120470496905, 2.8486707345195783, 2.57597946407787, 2.3265996248619105, 1.9431716231412666, 1.6970903485320334, 1.3644813128880091, 1.2375905420693052, 0.8851172401263468, 0.8464128642383207, 0.5823763041599248, 0.4602829875344058]}
Starting work on cross validation set 5
Creating Model
Epoc

Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.041666666666666664, 0.06439393939393939, 0.056818181818181816, 0.08143939393939394, 0.10037878787878787, 0.10037878787878787, 0.13446969696969696, 0.16098484848484848, 0.25757575757575757, 0.2689393939393939, 0.3522727272727273, 0.4393939393939394, 0.5246212121212122, 0.5946969696969697, 0.6950757575757576, 0.7878787878787878, 0.8409090909090909, 0.8825757575757576, 0.9090909090909091, 0.9356060606060606], 'loss': [4.691659566127893, 4.273039528817842, 4.100989876371441, 3.994010217262037, 3.8480561429804023, 3.677702860398726, 3.502653945576061, 3.301510529084639, 2.9749556310249097, 2.775935895515211, 2.4949935855287495, 2.1837160226070518, 1.8560212091966108, 1.584432688626376, 1.2456301179799167, 0.9347401640631936, 0.7085631345257615, 0.5665768908731865, 0.44363827416391083, 0.3772876696153120

Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
('Test accuracy: ', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
{'acc': [0.04509283820616788, 0.08090185677380714, 0.06763925741300658, 0.09283819640504903, 0.1140583554475472, 0.11140583555364798, 0.16180371353773287, 0.19098143238050552, 0.23209549073594318, 0.2625994695059026, 0.3302387271066559, 0.39655172421698226, 0.4814323610589106, 0.5623342177437851, 0.6790450933125038, 0.7427055707660847, 0.8236074265813954, 0.8726790455671457, 0.8992042435575227, 0.9217506631299734], 'loss': [4.636089188350606, 4.1864450135977265, 4.100213534003544, 3.9483115590851883, 3.843667171361908, 3.704173157006423, 3.5241424765447724, 3.3454189167731005, 3.1056937987987814, 2.8894043645428726, 2.5853507834973324, 2.2551933884304463, 1.9250771392245507, 1.6565946185620457, 1.3064601943094472, 1.0790650809791422, 0.792349

[0.0, 0.0, 0.0, 2.941176470588235, 4.411764705882353, 5.88235294117647, 7.352941176470589, 7.352941176470589, 7.352941176470589, 7.352941176470589]
3
[2.941176470588235, 4.411764705882353, 10.294117647058822, 11.76470588235294, 11.76470588235294, 14.705882352941178, 17.647058823529413, 17.647058823529413, 20.588235294117645, 20.588235294117645]
4
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
